readStructure <- function(path) {
k <- as.numeric(sub(".+-(.*)", "\\1", str_remove(basename(path), "\\..*") ))
kpops <- paste0("k", 1:k)
read_lines(path) |>
str_remove("^\\s+") |>
str_remove("\\s+$") |>
str_remove(": ") |>
str_replace_all("\\s+", "\t") |>
as_tibble() |>
separate(col = value, into = c("i1", "i2", "i3", "i4", kpops), sep = "\t") |>
select(i1, starts_with("k")) |>
pivot_longer(names_to = "k-value", values_to = "proportion", starts_with("k")) |>
mutate(
proportion = as.numeric(proportion),
i1 = as.numeric(i1)
)
}
# Function that takes the PCA list we'll generate below and plot the desired PCs
# for the specified population
plotPCA <- function(pca_list, species, pcA, pcB, xstep = 1, ystep = 1, pal = "Paired") {
sp_pca <- pca_list |>
pluck(species)
# x-axis: max/min
xmin <- sp_pca |>
pull(pcA) |>
min(na.rm = TRUE) |>
floor()
xmax <- sp_pca |>
pull(pcA) |>
max(na.rm = TRUE) |>
ceiling()
# y-axis: max/min
ymin <- sp_pca |>
pull(pcB) |>
min(na.rm = TRUE) |>
floor()
ymax <- sp_pca |>
pull(pcB) |>
max(na.rm = TRUE) |>
ceiling()
sp_pca |>
ggplot(aes(x = .data[[pcA]], y = .data[[pcB]], colour = population)) +
geom_point(size = 2) +
scale_color_brewer(palette = pal) +
labs(title = glue::glue("{pcA} vs {pcB}")) +
scale_y_continuous(
limits = c(ymin, ymax),
breaks = seq(ymin, ymax, by = ystep),
labels = as.character(seq(ymin, ymax, by = ystep)),
expand = c(0, 0)
) +
scale_x_continuous(
limits = c(xmin, xmax),
breaks = seq(xmin, xmax, by = xstep),
labels = as.character(seq(xmin, xmax, by = xstep)),
expand = c(0, 0)
) +
theme(
legend.position = "bottom",
legend.title = element_text(hjust = 0.5, face = "bold", size = 14),
legend.title.position = "top",
plot.title = element_text(hjust = 0.5),
title = element_text(face = "bold")
)
}
Here we load the PCA CSV file generated by the
Ipyrad.analysis toolkit.
lst_pca <- fs::dir_ls(here("results", "population-structure", "pca"), glob = "*.csv") |>
(\(x) set_names(x, str_remove(basename(x), "-.*")))() |>
imap(\(x, i) {
tmp <- x |>
read_csv(col_names = TRUE, col_types = cols())
colnames(tmp) <- c("sample", paste0("PC", 1:(ncol(tmp) - 1)))
pops |>
filter(species == i) |>
left_join(tmp)
})
pc_1_2 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC1", pcB = "PC2", xstep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC1", pcB = "PC3", xstep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "ALA", pcA = "PC2", pcB = "PC3")
design <- "12\n34"
pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
plot_layout(
design = design,
guides = "collect",
axes = "collect"
) &
guides(colour = guide_legend(ncol = 3, override.aes = list(size = 3)))
pc_1_2 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC1", pcB = "PC2", xstep = 2, ystep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC1", pcB = "PC3", xstep = 2, ystep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "HMA", pcA = "PC2", pcB = "PC3", ystep = 2)
pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
plot_layout(
design = design,
guides = "collect",
axes = "collect"
) &
guides(colour = guide_legend(ncol = 3, override.aes = list(size = 4)))
pc_1_2 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC1", pcB = "PC2", xstep = 2, ystep = 2)
pc_1_3 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC1", pcB = "PC3", xstep = 2, ystep = 2)
pc_2_3 <- plotPCA(pca_list = lst_pca, species = "HST", pcA = "PC2", pcB = "PC3", xstep = 2, ystep = 2)
pc_1_2 + pc_1_3 + pc_2_3 + guide_area() +
plot_layout(
design = design,
guides = "collect",
axes = "collect"
) &
guides(colour = guide_legend(ncol = 3, override.aes = list(size = 4)))
Below we plot the STRUCTURE results generated for all
sample in each species. If we wanted to generate a plot with a subset of
the samples (e.g. a specific couple of populations), we’d need to re-run
STRUCTURE with just those samples.
First we load in the STRUCTURE output files using our
custom R function (see above).
pops_structure <- pops |> mutate(i1 = row_number(), .by = species)
structure_df <- fs::dir_ls(
here("results", "population-structure"),
glob = "*clumpp.outfile",
recurse = TRUE
) |>
(\(x) set_names(x, str_remove(basename(x), "\\..*")))() |>
map(readStructure) |>
list_rbind(names_to = "species") |>
separate_wider_delim(delim = "-", names = c("species", "K"), too_many = "merge", cols = species) |>
left_join(pops_structure)
Choosing the best K value can be determined by plotting
the estimated log-probability for each K value, in addition
to the delta-K value. The values can be interpreted as follows:
etable <- fs::dir_ls(here("results"), glob = "*etable.csv", recurse = TRUE) |>
read_csv(col_names = TRUE, col_types = cols(), id = "species") |>
mutate(species = str_remove(basename(species), "-etable.csv")) |>
rename(K = `...1`)
prb <- etable |>
ggplot(aes(x = K, y = estLnProbMean, colour = species)) +
geom_point() +
geom_line() +
scale_y_continuous(
name = "Estimated mean log-probability",
breaks = seq(0, -105000, -10000),
limits = c(-105e3, -4e4),
labels = scales::label_number(style_negative = "minus")
) +
scale_x_continuous(breaks = seq(2, 9, 1))
dk <- etable |>
ggplot(aes(x = K, y = deltaK, colour = species)) +
geom_point() +
geom_line() +
scale_x_continuous(breaks = seq(2, 9, 1)) +
scale_y_continuous(
name = "Delta-K",
breaks = seq(0, 8, 1)
)
prb/dk +
plot_layout(axes = "collect", guides = "collect")
For the three species, with the current data, we can determine the following:
k = 5 and K = 7. The Delta-K for both
of these values is quite high, indicating that both K-values improved
the model fit substantially.K = 5 appears to be
the best choice for H. major. The estimated log-probability is
relatively stable for most values of K, before dipping once
reaching higher values. Delta-K peaks at K = 5, with little
improvement after.K values. Delta-K spikes at
K = 4, indicating that this is likely the best value for
the current dataset.structure_df |>
filter(species == "ALA", K == "K-5") |>
ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
geom_col(position = "stack") +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = NULL) +
facet_grid(
cols = vars(population), scales = "free_x", space = "free_x"
) +
guides(fill = guide_legend(title.position = "bottom")) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
strip.background = element_blank(),
legend.position = "bottom",
legend.title = element_text(hjust = 0.5),
panel.spacing.x = unit(0.2, "line")
)
structure_df |>
filter(species == "HMA", K == "K-5") |>
ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
geom_col(position = "stack") +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = NULL) +
facet_grid(
cols = vars(population), scales = "free_x", space = "free_x"
) +
guides(fill = guide_legend(title.position = "bottom")) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
strip.background = element_blank(),
legend.position = "bottom",
legend.title = element_text(hjust = 0.5),
panel.spacing.x = unit(0.2, "line")
)
structure_df |>
filter(species == "HST", K == "K-4") |>
ggplot(aes(x = sample, y = proportion, fill = `k-value`)) +
geom_col(position = "stack") +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = NULL) +
facet_grid(
cols = vars(population), scales = "free_x", space = "free_x"
) +
guides(fill = guide_legend(title.position = "bottom")) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.98),
strip.text = element_text(angle = 90, vjust = 0.5, hjust = 0.1),
strip.background = element_blank(),
legend.position = "bottom",
legend.title = element_text(hjust = 0.5),
panel.spacing.x = unit(0.2, "line")
)